{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Evaluation of the new oversampler on the standard database foldings\n",
    "\n",
    "In this notebook we give an example evaluating a new oversampler on the standard 104 imbalanced datasets. The evaluation is highly similar to that illustrated in the notebook ```002_evaluation_multiple_datasets``` with the difference that in this case some predefined dataset foldings are used to make the results comparable to those reported in the ranking page of the documentation. The database foldings need to be downloaded from the github repository and placed in the 'smote_foldings' directory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os, pickle, itertools\n",
    "\n",
    "# import classifiers\n",
    "from sklearn.calibration import CalibratedClassifierCV\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from smote_variants import MLPClassifierWrapper\n",
    "\n",
    "# import SMOTE variants\n",
    "import smote_variants as sv\n",
    "\n",
    "# itertools to derive imbalanced databases\n",
    "import imbalanced_databases as imbd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# setting global parameters\n",
    "\n",
    "folding_path= os.path.join(os.path.expanduser('~'), 'smote_foldings')\n",
    "\n",
    "if not os.path.exists(folding_path):\n",
    "    os.makedirs(folding_path)\n",
    "    \n",
    "max_sampler_parameter_combinations= 35\n",
    "n_jobs= 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# instantiate classifiers\n",
    "\n",
    "# Support Vector Classifiers with 6 parameter combinations\n",
    "sv_classifiers= [CalibratedClassifierCV(LinearSVC(C=1.0, penalty='l1', loss= 'squared_hinge', dual= False)),\n",
    "                 CalibratedClassifierCV(LinearSVC(C=1.0, penalty='l2', loss= 'hinge', dual= True)),\n",
    "                 CalibratedClassifierCV(LinearSVC(C=1.0, penalty='l2', loss= 'squared_hinge', dual= False)),\n",
    "                 CalibratedClassifierCV(LinearSVC(C=10.0, penalty='l1', loss= 'squared_hinge', dual= False)),\n",
    "                 CalibratedClassifierCV(LinearSVC(C=10.0, penalty='l2', loss= 'hinge', dual= True)),\n",
    "                 CalibratedClassifierCV(LinearSVC(C=10.0, penalty='l2', loss= 'squared_hinge', dual= False))]\n",
    "\n",
    "# Multilayer Perceptron Classifiers with 6 parameter combinations\n",
    "mlp_classifiers= []\n",
    "for x in itertools.product(['relu', 'logistic'], [1.0, 0.5, 0.1]):\n",
    "    mlp_classifiers.append(MLPClassifierWrapper(activation= x[0], hidden_layer_fraction= x[1]))\n",
    "\n",
    "# Nearest Neighbor Classifiers with 18 parameter combinations\n",
    "nn_classifiers= []\n",
    "for x in itertools.product([3, 5, 7], ['uniform', 'distance'], [1, 2, 3]):\n",
    "    nn_classifiers.append(KNeighborsClassifier(n_neighbors= x[0], weights= x[1], p= x[2]))\n",
    "\n",
    "# Decision Tree Classifiers with 6 parameter combinations\n",
    "dt_classifiers= []\n",
    "for x in itertools.product(['gini', 'entropy'], [None, 3, 5]):\n",
    "    dt_classifiers.append(DecisionTreeClassifier(criterion= x[0], max_depth= x[1]))\n",
    "\n",
    "classifiers= []\n",
    "classifiers.extend(sv_classifiers)\n",
    "classifiers.extend(mlp_classifiers)\n",
    "classifiers.extend(nn_classifiers)\n",
    "classifiers.extend(dt_classifiers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# querying datasets for the evaluation\n",
    "\n",
    "datasets= imbd.get_data_loaders('study')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2019-06-11 18:25:54,178:INFO:dataset: CM1, samplings_available: False, evaluations_available: False\n",
      "2019-06-11 18:25:54,179:INFO:doing the folding\n",
      "2019-06-11 18:25:54,179:INFO:Folding reading from file folding_CM1.pickle\n",
      "2019-06-11 18:25:54,181:INFO:do the samplings\n",
      "2019-06-11 18:25:54,181:INFO:create sampling objects\n",
      "2019-06-11 18:25:54,215:INFO:executing 42 sampling in parallel\n",
      "2019-06-11 18:25:56,082:INFO:do the evaluations\n",
      "2019-06-11 18:25:56,083:INFO:create classifier jobs\n",
      "2019-06-11 18:25:56,339:INFO:executing 42 evaluation jobs in parallel\n",
      "2019-06-11 18:34:29,776:INFO:removing unnecessary sampling files\n",
      "2019-06-11 18:34:29,796:INFO:concatenating the results\n",
      "2019-06-11 18:34:30,106:INFO:aggregating the results\n",
      "2019-06-11 18:34:30,392:INFO:dataset: german, samplings_available: False, evaluations_available: False\n",
      "2019-06-11 18:34:30,393:INFO:doing the folding\n",
      "2019-06-11 18:34:30,394:INFO:Folding reading from file folding_german.pickle\n",
      "2019-06-11 18:34:30,397:INFO:do the samplings\n",
      "2019-06-11 18:34:30,398:INFO:create sampling objects\n",
      "2019-06-11 18:34:30,399:INFO:executing 42 sampling in parallel\n",
      "2019-06-11 18:34:32,319:INFO:do the evaluations\n",
      "2019-06-11 18:34:32,320:INFO:create classifier jobs\n",
      "2019-06-11 18:34:32,760:INFO:executing 42 evaluation jobs in parallel\n",
      "2019-06-11 18:51:18,596:INFO:removing unnecessary sampling files\n",
      "2019-06-11 18:51:18,622:INFO:concatenating the results\n",
      "2019-06-11 18:51:18,814:INFO:aggregating the results\n",
      "2019-06-11 18:51:18,926:INFO:dataset: hepatitis, samplings_available: False, evaluations_available: False\n",
      "2019-06-11 18:51:18,927:INFO:doing the folding\n",
      "2019-06-11 18:51:18,927:INFO:Folding reading from file folding_hepatitis.pickle\n",
      "2019-06-11 18:51:18,950:INFO:do the samplings\n",
      "2019-06-11 18:51:18,950:INFO:create sampling objects\n",
      "2019-06-11 18:51:18,952:INFO:executing 42 sampling in parallel\n",
      "2019-06-11 18:51:19,336:INFO:do the evaluations\n",
      "2019-06-11 18:51:19,337:INFO:create classifier jobs\n",
      "2019-06-11 18:51:19,661:INFO:executing 42 evaluation jobs in parallel\n",
      "2019-06-11 18:53:59,096:INFO:removing unnecessary sampling files\n",
      "2019-06-11 18:53:59,102:INFO:concatenating the results\n",
      "2019-06-11 18:53:59,378:INFO:aggregating the results\n",
      "2019-06-11 18:53:59,645:INFO:dataset: hypothyroid, samplings_available: False, evaluations_available: False\n",
      "2019-06-11 18:53:59,646:INFO:doing the folding\n",
      "2019-06-11 18:53:59,647:INFO:Folding reading from file folding_hypothyroid.pickle\n",
      "2019-06-11 18:53:59,802:INFO:do the samplings\n",
      "2019-06-11 18:53:59,803:INFO:create sampling objects\n",
      "2019-06-11 18:53:59,804:INFO:executing 42 sampling in parallel\n",
      "2019-06-11 18:54:09,386:INFO:do the evaluations\n",
      "2019-06-11 18:54:09,387:INFO:create classifier jobs\n",
      "2019-06-11 18:54:09,822:INFO:executing 42 evaluation jobs in parallel\n",
      "/home/gykovacs/anaconda3/lib/python3.6/site-packages/joblib/externals/loky/process_executor.py:706: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.\n",
      "  \"timeout or by a memory leak.\", UserWarning\n"
     ]
    }
   ],
   "source": [
    "# executing the evaluation\n",
    "\n",
    "results= sv.evaluate_oversamplers(datasets,\n",
    "                                  samplers= sv.get_all_oversamplers(),\n",
    "                                  classifiers= classifiers,\n",
    "                                  cache_path= folding_path,\n",
    "                                  n_jobs= n_jobs,\n",
    "                                  remove_sampling_cache= True,\n",
    "                                  max_samp_par_comb= max_sampler_parameter_combinations)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The evaluation results are available in the results dataframe for further analysis.\n",
    "\n",
    "print(results)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
